Scott Cole
8 May 2017
This notebook covers how to use Python to analyze the relationship between multiple input variables and one output variable using multiple linear regression.
Outline:
1. Simple linear regression, and what happens when its assumptions (normally distributed errors, independent samples) are violated
2. Correlating individual burrito dimensions with overall satisfaction
3. Multiple linear regression of burrito dimensions onto overall satisfaction
4. Collinearity between features, and regressing one feature out of another
We will be using the library statsmodels to run our linear regression
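As a preview, here is a minimal sketch of what a multiple regression looks like with the statsmodels formula API (the dataframe and column names below are made up purely for illustration):
import pandas as pd
import statsmodels.formula.api as smf

# Hypothetical toy data: two predictors (x1, x2) and one outcome (y)
df_demo = pd.DataFrame({'x1': [1, 2, 3, 4, 5],
                        'x2': [2, 1, 4, 3, 5],
                        'y': [1.5, 2.0, 4.5, 4.0, 6.0]})

# 'y ~ x1 + x2' means: model y as a weighted sum of x1 and x2, plus an intercept
fit_demo = smf.ols('y ~ x1 + x2', data=df_demo).fit()
print(fit_demo.params)    # intercept and coefficient estimates
print(fit_demo.rsquared)  # fraction of variance explained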
In [1]:
import numpy as np
import statsmodels.formula.api as smf
import pandas as pd
import scipy as sp
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import matplotlib.pyplot as plt
In [2]:
# Define true statistics relating x and y
N_points = 10
true_beta0 = 0
true_beta1 = 2
noise_stdev = 1
# Set random seed
np.random.seed(0)
# Generate correlated data
x = np.random.randn(N_points) + 2
y = true_beta0 + true_beta1*x + np.random.randn(N_points)*noise_stdev
print('x=', x)
print('y=', y)
In [3]:
# Plot x and y
plt.figure(figsize=(4,4))
plt.plot(x, y, 'k.', ms=12)
plt.xlabel('x',size=15)
plt.ylabel('y',size=15)
Out[3]:
In [4]:
# Fit line to data
A = np.vstack([x, np.ones(len(x))]).T
m, b = np.linalg.lstsq(A, y, rcond=None)[0]
print('True statistics: y =', true_beta1, '*x +', true_beta0)
print('Estimated stats: y =', m, '*x +', b)
print('R squared (fraction of variance explained) =', np.round(sp.stats.pearsonr(x, y)[0]**2, 2))
In [5]:
# Plot fitted line
plt.figure(figsize=(4,4))
plt.plot(x, y, 'k.', ms=12)
plt.plot([0,5], [true_beta1*xi + true_beta0 for xi in [0,5]], 'k--', label='True correlation')
plt.plot([0,5], [m*xi + b for xi in [0,5]], 'r--', label='Estimated correlation')
plt.xlabel('x',size=15)
plt.ylabel('y',size=15)
plt.xlim((0,5))
plt.legend(loc='best')
Out[5]:
In [6]:
# Simulate data with non-normal distribution of error
np.random.seed(1)
N_points = 100
x = np.random.randn(N_points) + 2
y = true_beta0 + true_beta1*x + np.random.randn(N_points)**2
In [7]:
# Fit line to data
A = np.vstack([x, np.ones(len(x))]).T
m, b = np.linalg.lstsq(A, y, rcond=None)[0]
print('True statistics: y =', true_beta1, '*x +', true_beta0)
print('Estimated stats: y =', m, '*x +', b)
print('R squared (fraction of variance explained) =', np.round(sp.stats.pearsonr(x, y)[0]**2, 2))
In [8]:
# Plot fitted line
plt.figure(figsize=(4,4))
plt.plot(x, y, 'k.', ms=8)
plt.plot([0,5], [true_beta1*xi + true_beta0 for xi in [0,5]], 'k--', label='True correlation')
plt.plot([0,5], [m*xi + b for xi in [0,5]], 'r--', label='Estimated correlation')
plt.xlabel('x',size=15)
plt.ylabel('y',size=15)
plt.xlim((0,5))
plt.legend(loc='best')
Out[8]:
In [9]:
plt.figure(figsize=(8,3))
errors = y - [m*xi+b for xi in x]
hist2 = plt.hist(np.random.randn(100000)*np.std(errors), np.arange(-8,8,.5), color='r', density=True, alpha=.5, label='normal')
hist = plt.hist(errors, np.arange(-8,8,.5), color='k', density=True, alpha=.3, label='True error')
plt.legend(loc='best')
plt.xlabel('Estimate error')
plt.ylabel('Probability')
plt.yticks(np.arange(0,1.2,.2),np.arange(0,.6,.1))
Out[9]:
Linear regression works well only if samples are independent and identically distributed (IID). If this assumption is violated, then the computed correlation statistics are not reliable.
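Before getting to a burrito example of this, here is a toy sketch (with made-up numbers) of how non-independent samples can manufacture an apparently significant correlation out of pure noise:
import numpy as np
import scipy.stats as stats

np.random.seed(42)
# 20 independent, uncorrelated samples
x_iid = np.random.randn(20)
y_iid = np.random.randn(20)
r, p = stats.pearsonr(x_iid, y_iid)
print('independent samples: r =', np.round(r, 2), ', p =', np.round(p, 3))

# Append the same (x, y) observation 30 more times, as if one sample were counted repeatedly
x_dup = np.append(x_iid, np.ones(30)*2)
y_dup = np.append(y_iid, np.ones(30)*2)
r, p = stats.pearsonr(x_dup, y_dup)
print('with repeated samples: r =', np.round(r, 2), ', p =', np.round(p, 3))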
In [10]:
# Simulate burrito ratings: 50 independent burritos, plus one new burrito rated many times at the same cost
np.random.seed(0)
burrito1_cost = 6 + np.random.randn(50)
burrito1_stars = 3.5 + np.random.randn(50)*.8
burrito_new_cost = 4
burrito_new_stars = np.arange(4,5.1,.1)
# Define cost and stars arrays
c = np.append(np.ones(len(burrito_new_stars))*burrito_new_cost,burrito1_cost)
s = np.append(burrito_new_stars,burrito1_stars)
# Compute correlation
print('Statistics of random data points')
print('R squared (fraction of variance explained) =', np.round(sp.stats.pearsonr(burrito1_cost, burrito1_stars)[0]**2, 2))
print('p =', np.round(sp.stats.pearsonr(burrito1_cost, burrito1_stars)[1], 3))
print('\nStatistics after adding in the non-independent data points')
print('R squared (fraction of variance explained) =', np.round(sp.stats.pearsonr(c, s)[0]**2, 2))
print('p =', np.round(sp.stats.pearsonr(c, s)[1], 3))
In [11]:
# Fit line to data
A = np.vstack([c, np.ones(len(c))]).T
m, b = np.linalg.lstsq(A, s, rcond=None)[0]
# Plot fitted line
plt.figure(figsize=(4,4))
plt.plot(c, s, 'k.', ms=8)
plt.plot([0,10], [m*xi + b for xi in [0,10]], 'k--')
plt.xlabel('Burrito cost')
plt.ylabel('Stars')
plt.xlim((0,10))
Out[11]:
In [12]:
# Load burrito data into pandas dataframe
url = 'https://docs.google.com/spreadsheet/ccc?key=18HkrklYz1bKpDLeL-kaMrGjAhUM6LeJMIACwEljCgaw&output=csv'
df = pd.read_csv(url)
# Delete unreliable ratings
df['Unreliable'] = df['Unreliable'].map({'x': 1, 'X': 1, 1: 1})
df['Unreliable'] = df['Unreliable'].fillna(0)
df = df[df['Unreliable'] == 0]
# Rename the Meat:filling column because ':' is not allowed in statsmodels formula strings
df.rename(columns={'Meat:filling': 'Meatratio'}, inplace=True)
# Limit data to main features
df = df[['Location','Burrito','Yelp','Cost','Hunger', 'Volume', 'Tortilla', 'Temp', 'Meat',
'Fillings', 'Meatratio', 'Uniformity', 'Salsa', 'Synergy', 'Wrap', 'overall']]
df.tail()
Out[12]:
In [13]:
# Define dimensions of interest
dims = ['Cost', 'Hunger', 'Tortilla', 'Temp', 'Meat',
'Fillings', 'Meatratio', 'Uniformity', 'Salsa', 'Wrap']
# Correlate each dimension to the overall satisfaction rating
results = {}
for d in dims:
    model_str = 'overall ~ ' + d
    results[d] = smf.ols(model_str, data=df, missing='drop').fit()
    print(d, ', R2 =', results[d].rsquared, ', p =', np.round(results[d].pvalues[d], 4))
In [14]:
plt.plot(df['Fillings'],df['overall'],'k.')
plt.xlabel('Nonmeat filling flavor')
plt.ylabel('overall satisfaction')
Out[14]:
In [15]:
model_str = 'overall ~ ' + ' + '.join(dims)
print(model_str)
In [16]:
results_all = smf.ols(model_str, data=df, missing='drop').fit()
print(results_all.summary())
In [17]:
dims = ['Cost','Hunger', 'Tortilla', 'Temp', 'Meat',
'Fillings', 'Meatratio', 'Uniformity', 'Salsa', 'Synergy', 'Wrap']
model_str = 'overall ~ ' + ' + '.join(dims)
results_all = smf.ols(model_str, data=df, missing='drop').fit()
print(results_all.summary())
In [18]:
dfcorr = df[dims].corr()
M = len(dims)
from matplotlib import cm
clim1 = (-1,1)
plt.figure(figsize=(12,10))
cax = plt.pcolor(range(M+1), range(M+1), dfcorr, cmap=cm.bwr)
cbar = plt.colorbar(cax, ticks=(-1,-.5,0,.5,1))
cbar.ax.set_ylabel('Pearson correlation (r)', size=30)
plt.clim(clim1)
cbar.ax.set_yticklabels((-1,-.5,0,.5,1),size=20)
ax = plt.gca()
ax.set_yticks(np.arange(M)+.5)
ax.set_yticklabels(dims,size=25)
ax.set_xticks(np.arange(M)+.5)
ax.set_xticklabels(dims,size=25)
plt.xticks(rotation='vertical')
plt.tight_layout()
plt.xlim((0,M))
plt.ylim((0,M))
Out[18]:
In [19]:
# Simulate two collinear predictors: x2 is a noisier copy of x1, and both track y
y = np.arange(1,2,.01)
x1 = y + np.random.randn(len(y))*.1
x2 = x1 + np.random.randn(len(y))*.3
In [20]:
plt.figure(figsize=(8,4))
plt.subplot(1,2,1)
plt.plot(x1,y,'k.')
plt.ylabel('y')
plt.xlabel('x1')
plt.subplot(1,2,2)
plt.plot(x2,y,'k.')
plt.xlabel('x2')
plt.figure(figsize=(4,4))
plt.plot(x1,x2,'k.')
plt.xlabel('x1')
plt.ylabel('x2')
Out[20]:
In [21]:
print('Correlation coefficient between x1 and y: ', np.round(sp.stats.pearsonr(x1,y)[0],3))
print('Correlation coefficient between x2 and y: ', np.round(sp.stats.pearsonr(x2,y)[0],3))
print('Correlation coefficient between x1 and x2: ', np.round(sp.stats.pearsonr(x1,x2)[0],3))
In [22]:
# Regress out features
def regress_out(x, y):
    """Regress x out of y and return the residuals of y"""
    A = np.vstack([x, np.ones(len(x))]).T
    m, b = np.linalg.lstsq(A, y, rcond=None)[0]
    return y - b - x*m
x2b = regress_out(x1, x2)
In [23]:
# Visualize relationships with x2 after regressing out x1
plt.figure(figsize=(4,7))
plt.subplot(2,1,1)
plt.plot(x2b,x1,'k.')
plt.ylabel('x1')
plt.subplot(2,1,2)
plt.plot(x2b,y,'k.')
plt.ylabel('y')
plt.xlabel('x2 after regressing out x1')
Out[23]:
In [24]:
print('After regressing out x1 from x2:')
print('Correlation coefficient between x2 and y: ', np.round(sp.stats.pearsonr(x2b,y)[0],3))
print('Correlation coefficient between x1 and x2: ', np.round(sp.stats.pearsonr(x1,x2b)[0],3))
In [25]:
df = pd.DataFrame.from_dict({'x1':x1, 'x2':x2, 'y':y})
results_all = smf.ols('y ~ x1 + x2', data=df).fit()
print(results_all.summary())